InĀ [1]:
import scanpy as sc
import pandas as pd
from pathlib import Path
import anndata as ad
import numpy as np
import scvi
import os
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_columns = 50
DPI=300
FONTSIZE=20 #42
sc.settings.set_figure_params(scanpy = True, dpi=100, transparent=True, vector_friendly = True, dpi_save=DPI)
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42
Global seed set to 0
InĀ [2]:
os.chdir('/data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis')
DIR2SAVE = Path('/data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis')
FIG2SAVE = DIR2SAVE.joinpath('Figures/')
FIG2SAVE
# set the global variable: sc.settings.figdir to save all plots
sc.settings.figdir = FIG2SAVE
InĀ [3]:
adata = sc.read_mtx('/data/BCI-CRC/SO/data/CRC_multiome/ArchR_final_analysis/Matrices/GEX_decontaminated.mtx').T
adata
Out[3]:
AnnData object with n_obs Ć n_vars = 23119 Ć 36485
InĀ [4]:
features = pd.read_csv('/data/BCI-CRC/SO/data/CRC_multiome/ArchR_final_analysis/Matrices/GEX_decontaminated_genes.csv')
adata.var_names = features['x'].str.split('_', expand=True)[0]
barcodes = pd.read_csv('/data/BCI-CRC/SO/data/CRC_multiome/ArchR_final_analysis/Matrices/GEX_decontaminated_barcodes.csv')
adata.obs_names = barcodes['x']
meta = pd.read_csv('/data/BCI-CRC/SO/data/CRC_multiome/ArchR_final_analysis/Matrices/GEX_decontaminated_metadata.csv',
index_col=0)
meta = meta.drop(['nCount_ATAC', 'nFeature_ATAC'], axis=1) #drop Signac ATAC QC metrics - use metrics from ArchR
adata.obs = meta
adata
Out[4]:
AnnData object with n_obs Ć n_vars = 23119 Ć 36485
obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Sample', 'Patient', 'Therapy', 'Tissue', 'TSSEnrichment', 'nFrags', 'percent.mt', 'percent.ribo', 'RNA_snn_res.0.5', 'seurat_clusters', 'integrated_snn_res.0.5', 'Clusters_all_cells_preDecon', 'Cell_type_preDecon', 'ident', 'decontX_contamination', 'decontX_clusters', 'integratedRNADecon_snn_res.0.5'
InĀ [5]:
X_CCA = pd.read_csv('/data/BCI-CRC/SO/data/CRC_multiome/ArchR_final_analysis/Matrices/GEX_decontaminated.CCA.csv', index_col=0)
X_CCA = X_CCA.to_numpy()
print(f"X_CCA shape: {np.shape(X_CCA)}")
adata.obsm['X_CCA'] = X_CCA
print(f"X_CCA: {adata.obsm['X_CCA']}")
X_CCA shape: (23119, 2) X_CCA: [[-12.20550625 3.08438743] [ 4.90263804 -2.15925419] [-11.58478538 3.74730027] ... [ -4.40180819 -13.12077558] [ 3.6492203 -1.49928909] [ -5.45846741 -13.2389425 ]]
InĀ [6]:
adata.obs.columns
Out[6]:
Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Sample', 'Patient',
'Therapy', 'Tissue', 'TSSEnrichment', 'nFrags', 'percent.mt',
'percent.ribo', 'RNA_snn_res.0.5', 'seurat_clusters',
'integrated_snn_res.0.5', 'Clusters_all_cells_preDecon',
'Cell_type_preDecon', 'ident', 'decontX_contamination',
'decontX_clusters', 'integratedRNADecon_snn_res.0.5'],
dtype='object')
InĀ [7]:
sc.pl.embedding(adata, 'X_CCA', color=['ELF3','COL3A1','CP','SLC11A1','CD79A','TRAC'], use_raw=False, ncols=6, vmax='p99', vmin=1, color_map='plasma_r')
InĀ [8]:
genes=['EPCAM','ELF3','HNF4A',
'SLC11A1','CD68','LYZ',
'CD79A','BANK1','BLK',
'CD3E','PTPRC','TRAC',
'ALB','CP','DEFB1',
'PECAM1','ARL15','VWF',
'DCN','ACTA2','COL3A1',
'PROX1']
sc.pl.embedding(adata, 'X_CCA', color=genes, use_raw=False, ncols=6, vmax='p99', vmin=1, color_map='plasma_r')
InĀ [9]:
adata.obs.Cell_type_preDecon.value_counts()
Out[9]:
Epithelial 17758 Myeloid 1900 T/NK/ILC 1600 Fibroblast 675 Endothelial 496 Hepatocytes 490 B 200 Name: Cell_type_preDecon, dtype: int64
InĀ [10]:
adata.obs.Sample.value_counts()
Out[10]:
CRC08_LM 4319 CRC09_LM 3268 CRC04_LM 2960 CRC10_LM 1980 CRC14_LM 1845 CRC15_LM 1408 CRC06_LM 1393 CRC11_LM 1373 CRC02_LM 975 CRC03_LM 790 CRC12_LM 735 CRC05_LM 708 CRC13_LM 533 CRC07_LM 464 CRC01_LM 368 Name: Sample, dtype: int64
InĀ [11]:
# keep raw decontaminated counts
adata.layers["raw"] = adata.X.copy() # preserve counts
# normalize + log1p
sc.pp.normalize_total(adata, target_sum=1e4, inplace=True)
adata.layers["normalised"] = adata.X.copy()
sc.pp.log1p(adata)
adata.layers["log1p"] = adata.X.copy()
adata.raw = adata # keep normalised log1p as .raw
InĀ [12]:
sc.pp.highly_variable_genes(adata,
subset=True, # subset for integration (but full lognorm data in .raw)
layer='raw',
flavor='seurat_v3',
n_top_genes=2000,
span=0.3,
min_disp=0.5,
min_mean=0.0125,
max_mean=3,
batch_key='Sample'
)
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/scanpy/preprocessing/_highly_variable_genes.py:62: UserWarning: `flavor='seurat_v3'` expects raw count data, but non-integers were found. warnings.warn(
InĀ [13]:
print(adata.X.shape)
print(adata.raw.to_adata().X.shape)
(23119, 2000) (23119, 36485)
InĀ [14]:
for i in ['nCount_RNA', 'nFeature_RNA','percent.mt','percent.ribo', 'TSSEnrichment', 'nFrags', 'decontX_contamination']:
sc.pl.violin(adata, i, jitter=False, groupby='Sample', save='_'+i+'.pdf', rotation=90)
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_nCount_RNA.pdf
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_nFeature_RNA.pdf
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_percent.mt.pdf
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_percent.ribo.pdf
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_TSSEnrichment.pdf
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_nFrags.pdf
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_decontX_contamination.pdf
InĀ [15]:
scvi.model.SCVI.setup_anndata(
adata,
layer="raw",
categorical_covariate_keys=["Sample"],
continuous_covariate_keys=["percent.mt", "nFeature_RNA"]
)
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/scvi/data/fields/_layer_field.py:78: UserWarning: adata.layers[raw] does not contain unnormalized count data. Are you sure this is what you want? warnings.warn(
InĀ [16]:
model = scvi.model.SCVI(adata, n_latent=10)
model
SCVI Model with the following params: n_hidden: 128, n_latent: 10, n_layers: 1, dropout_rate: 0.1, dispersion: gene, gene_likelihood: zinb, latent_distribution: normal Training status: Not Trained
Out[16]:
InĀ [17]:
model.train()
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/torchmetrics/utilities/prints.py:36: UserWarning: Torchmetrics v0.9 introduced a new argument class property called `full_state_update` that has
not been set for this class (ElboMetric). The property determines if `update` by
default needs access to the full metric state. If this is not the case, significant speedups can be
achieved and we recommend setting this to `False`.
We provide an checking function
`from torchmetrics.utilities import check_forward_no_full_state`
that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
default for now) or if `full_state_update=False` can be used safely.
warnings.warn(*args, **kwargs)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Epoch 1/346: 0%| | 0/346 [00:00<?, ?it/s]
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/scvi/distributions/_negative_binomial.py:436: UserWarning: The value argument must be within the support of the distribution warnings.warn(
Epoch 346/346: 100%|āāāāāāāāāā| 346/346 [12:37<00:00, 2.19s/it, loss=776, v_num=1]
InĀ [18]:
# save model
import os
model.save(os.path.join(DIR2SAVE,'scvi_model_hvg_mt'))
InĀ [19]:
# read the model
model = scvi.model.SCVI.load(os.path.join(DIR2SAVE,'scvi_model_hvg_mt'), adata=adata)#, use_gpu=True)
INFO File /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/scvi_model_hvg_mt /model.pt already downloaded
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/scvi/data/fields/_layer_field.py:78: UserWarning: adata.layers[raw] does not contain unnormalized count data. Are you sure this is what you want? warnings.warn(
InĀ [20]:
latent = model.get_latent_representation() # get model output
InĀ [21]:
print(latent.shape)
# Itās often useful to store the outputs of scvi-tools back into the original anndata, as it permits interoperability with
# Scanpy.
adata.obsm["X_scVI"] = latent
# Letās store the normalized values back in the anndata.
adata.layers["scvi_normalized"] = model.get_normalized_expression(adata=adata, library_size=1e4)
print("Number of PCs:", latent.shape[1])
# use scVI latent space for UMAP generation
# set nb pcs to latent representation shape of scvi
# compute neighbourhood graph
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=latent.shape[1], knn=True, method='umap', metric='euclidean',
use_rep="X_scVI", random_state=7)
# compute UMAP embedding
sc.tl.umap(adata, min_dist=0.3, n_components=2, random_state=7)
(23119, 10) Number of PCs: 10
InĀ [22]:
sc.pl.umap(adata, color='Sample')
adata.obs['Cell_type_preDecon'] = adata.obs['Cell_type_preDecon'].astype("category")
sc.pl.umap(adata, color='Cell_type_preDecon')
InĀ [23]:
sc.pl.umap(adata, color=['nCount_RNA', 'nFeature_RNA','percent.mt','percent.ribo', 'TSSEnrichment', 'nFrags',
'decontX_contamination'],
ncols=4,color_map='viridis_r', vmax='p99',
save='_allCells_scVI_QC.pdf')
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/umap_allCells_scVI_QC.pdf
InĀ [24]:
genes=['EPCAM','ELF3','HNF4A',
'SLC11A1','CD68','LYZ',
'CD79A','BANK1','BLK',
'CD3E','PTPRC','TRAC',
'ALB','CP','DEFB1',
'PECAM1','ARL15','VWF',
'DCN','ACTA2','COL3A1',
'PROX1']
sc.pl.umap(adata, color=genes, use_raw=True, ncols=6, vmax='p99', vmin=1, color_map='plasma_r',
save='_allCells_scVI_markers.pdf')
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/umap_allCells_scVI_markers.pdf
InĀ [25]:
sc.tl.leiden(adata, resolution=0.5)
sc.pl.umap(adata, color='leiden')
sc.pl.umap(adata, color='leiden', legend_loc = 'on data')
InĀ [26]:
### Find marker genes
#del adata.uns['log1p']
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon',pts=True, use_raw=True)
sc.pl.rank_genes_groups(adata, n_genes=15, sharey=False)
InĀ [27]:
pval_thresh = 0.05
log2fc_thresh = 0.25
pct_cutoff = 0.1
cluster_de_genes = dict()
for cluster in sorted(set(adata.obs['leiden'])):
cluster_de_genes[cluster] = sc.get.rank_genes_groups_df(adata,
group=cluster,
key='rank_genes_groups',
pval_cutoff=pval_thresh,
log2fc_min=log2fc_thresh,
log2fc_max=None).sort_values('logfoldchanges',
ascending=False)
cluster_de_genes[cluster] = cluster_de_genes[cluster][cluster_de_genes[cluster]['pct_nz_group'] > pct_cutoff]
with pd.ExcelWriter('allCells_leiden_wilcoxon.xls') as writer:
for cluster in list(cluster_de_genes.keys()):
cluster_de_genes[cluster].to_excel(writer, sheet_name='cluster{}'.format(cluster))
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3338: FutureWarning: As the xlwt package is no longer maintained, the xlwt engine will be removed in a future version of pandas. This is the only engine in pandas that supports writing in the xls format. Install openpyxl and write to an xlsx file instead. You can set the option io.excel.xls.writer to 'xlwt' to silence this warning. While this option is deprecated and will also raise a warning, it can be globally set and the warning suppressed. if await self.run_code(code, result, async_=asy):
InĀ [28]:
adata.write('AllCells_scVI.h5ad')
InĀ [3]:
adata = sc.read('AllCells_scVI.h5ad')
adata
Out[3]:
AnnData object with n_obs Ć n_vars = 23119 Ć 2000
obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Sample', 'Patient', 'Therapy', 'Tissue', 'TSSEnrichment', 'nFrags', 'percent.mt', 'percent.ribo', 'RNA_snn_res.0.5', 'seurat_clusters', 'integrated_snn_res.0.5', 'Clusters_all_cells_preDecon', 'Cell_type_preDecon', 'ident', 'decontX_contamination', 'decontX_clusters', 'integratedRNADecon_snn_res.0.5', '_scvi_batch', '_scvi_labels', 'leiden'
var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
uns: 'Cell_type_preDecon_colors', 'Sample_colors', '_scvi_manager_uuid', '_scvi_uuid', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'rank_genes_groups', 'umap'
obsm: 'X_CCA', 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs', '_scvi_extra_continuous_covs'
layers: 'log1p', 'normalised', 'raw', 'scvi_normalized'
obsp: 'connectivities', 'distances'
InĀ [5]:
old_to_new = {'0':'Epithelial',
'1':'Epithelial',
'2':'Epithelial',
'3':'Epithelial',
'4':'Myeloid',
'5':'Epithelial',
'6':'T/NK/ILC',
'7':'Fibroblast',
'8':'Hepatocyte',
'9':'Endothelial',
'10':'Epithelial',
'11':'B',
'12':'Endothelial'
}
adata.obs['Cell_type'] = (adata.obs['leiden'].map(old_to_new).astype('category'))
sc.pl.umap(adata, color=['Cell_type','Cell_type_preDecon'])
InĀ [6]:
adata.write('CRCLM_decon_scvi.h5ad')
np.savetxt('CRCLM_decon_scvi_XscVI.csv', adata.obsm['X_scVI'], delimiter=',')
np.savetxt('CRCLM_decon_scvi_Xumap.csv', adata.obsm['X_umap'], delimiter=',')
adata.obs.to_csv('CRCLM_decon_scvi_metadata.csv')